package ar.uba.fi.taller2.ftrs.parser;

import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.hwpf.*;
import org.apache.poi.hwpf.extractor.*;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

import java.io.*;

/**
 * Parser for Microsoft Word (.doc) files.
 *
 * <p>Extracts the paragraph text of the document with Apache POI
 * ({@code HWPFDocument} / {@code WordExtractor}), joins it into a single
 * string and hands that string to {@code Parser.parsear(String, String)}.
 */
public class WordParser extends Parser {
	/**
	 * Matches the line terminator at the end of an extracted paragraph:
	 * an optional control-M, an optional carriage return and a line feed.
	 * Compiled once instead of on every loop iteration.
	 */
	private static final java.util.regex.Pattern LINE_BREAK =
			java.util.regex.Pattern.compile("\\cM?\r?\n");

	/** Path of the Word document to parse. */
	private final String fileName;

	/**
	 * Creates a parser for the given Word document.
	 *
	 * @param name path of the .doc file to read
	 */
	public WordParser(String name) {
		// Libraries for extracting text from DOC files in Java:
		//    * http://schmidt.devlib.org/java/libraries-word.html
		this.fileName = name;
	}

	/**
	 * Parses the document using {@code defaultLanguage}, a member that is
	 * not declared in this class (presumably inherited from {@code Parser}).
	 *
	 * @throws Exception if the file cannot be read or parsed
	 */
	public void parsear() throws Exception {
		this.parsear(this.defaultLanguage);
	}

	/**
	 * Extracts the text of the document and passes it, together with
	 * {@code lang}, to {@code Parser.parsear(String, String)}.
	 *
	 * <p>Each paragraph has its line terminator removed and is appended to
	 * the resulting text preceded by a single space.
	 *
	 * @param lang language value forwarded unchanged to the superclass
	 * @throws Exception if the file cannot be opened, is not a readable
	 *         Word document, or the superclass parsing fails
	 */
	public void parsear(String lang) throws Exception {
		String[] paragraphs;
		FileInputStream in = new FileInputStream(this.fileName);
		try {
			POIFSFileSystem fs = new POIFSFileSystem(in);
			HWPFDocument doc = new HWPFDocument(fs);
			WordExtractor we = new WordExtractor(doc);
			paragraphs = we.getParagraphText();
		} finally {
			// Always release the file handle, even when POI rejects the file.
			// Closing an already-closed FileInputStream is a no-op.
			in.close();
		}

		// StringBuilder avoids the quadratic cost of String += in a loop.
		StringBuilder text = new StringBuilder();
		for (int i = 0; i < paragraphs.length; i++) {
			paragraphs[i] = LINE_BREAK.matcher(paragraphs[i]).replaceAll("");
			text.append(' ').append(paragraphs[i]);
		}
		super.parsear(lang, text.toString());
	}

	/**
	 * Manual smoke test: parses {@code /tmp/test.doc} and prints any
	 * exception to standard output.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		WordParser word = new WordParser("/tmp/test.doc");
		try {
			word.parsear();
		} catch (Exception e) {
			System.out.println(e);
		}
	}
}
